from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from items import FirstscrapyItem


__author__ = 'hexun'
import scrapy
from scrapy import cmdline


class DmozSpider(scrapy.Spider):
    """Spider that scrapes site listings from two dmoztools.net category pages.

    Responses for ``start_urls`` are handled by ``parse``, which yields one
    ``FirstscrapyItem`` per listed site.
    """

    name = "dmoz"
    allowed_domains = ["dmoztools.net"]
    start_urls = [
        "http://dmoztools.net/Computers/Programming/Languages/Python/Resources/",
        "http://dmoztools.net/Computers/Programming/Languages/Python/Books/",
    ]

    # NOTE(review): per the Scrapy docs, ``rules`` is only consulted by
    # ``scrapy.spiders.CrawlSpider``. This class derives from ``scrapy.Spider``,
    # so the rule below appears to be inert and ``parse_item`` is never
    # scheduled. Switching the base class would also require renaming
    # ``parse``, which CrawlSpider reserves for its own use -- confirm the
    # intent before changing either.
    rules = (
        # Example: extract links matching 'category.php' (but not
        # 'subsection.php') and follow them (no callback means follow
        # defaults to True).
        # Rule(LinkExtractor(allow=(r'category\.php',), deny=(r'subsection\.php',))),

        # Extract links matching 'item.php' and parse them with the spider's
        # parse_item method.
        Rule(LinkExtractor(allow=(r'item\.php',)), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract the id, name and description cells from an item page.

        Returns a plain dict rather than a bare ``scrapy.Item()``: an ``Item``
        without declared fields rejects every key assignment with ``KeyError``.
        Each value is the list produced by the selector (``re`` / ``extract``).
        """
        self.logger.debug('Hi, this is an item page! %s', response.url)

        return {
            'id': response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)'),
            'name': response.xpath('//td[@id="item_name"]/text()').extract(),
            'description': response.xpath(
                '//td[@id="item_description"]/text()'
            ).extract(),
        }

    def parse(self, response):
        """Yield one ``FirstscrapyItem`` per ``.site-item`` entry on the page.

        The ``title``, ``link`` and ``desc`` fields each hold the list returned
        by ``extract()`` for the corresponding selector, so a missing element
        yields an empty list rather than an error.
        """
        for sel in response.css("#site-list-content .site-item"):
            item = FirstscrapyItem()
            item['title'] = sel.css('.title-and-desc a div::text').extract()
            item['link'] = sel.css('.title-and-desc').xpath("a/@href").extract()
            item['desc'] = (
                sel.css('.title-and-desc .site-descr').xpath('text()').extract()
            )
            yield item



if __name__ == '__main__':
    # Run the "dmoz" spider through Scrapy's command-line entry point and
    # export the scraped items to dmoz.json, as if typed in a shell.
    argv = "scrapy crawl dmoz  -o dmoz.json".split()
    cmdline.execute(argv)